{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a92df66b-68c9-4288-b881-45d1fd948c18",
   "metadata": {},
   "source": [
    "### Week 1 Contribution: Selenium-enhanced Website Summarizer\n",
    "This notebook attempts to summarize content from any website using a BeautifulSoup-first strategy with a Selenium fallback for JavaScript-heavy pages. Llama 3.2 is used to generate a markdown-formatted summary.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "407ea4b4-7c1b-4f94-a48d-f3ee3273bc61",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from IPython.display import Markdown,display\n",
    "from openai import OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "040e97a8-9a5f-4903-9d0e-fa19bb719b4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL=\"llama3.2\"\n",
    "openai=OpenAI(base_url=\"http://localhost:11434/v1\",api_key=\"ollama\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cac3c9ae-31ce-45b1-bbc1-70577a198e84",
   "metadata": {},
   "outputs": [],
   "source": [
    "message=\"Hi, write a snarky poem for me.\" \n",
    "response=openai.chat.completions.create(\n",
    "    model=MODEL,\n",
    "    messages=[{\n",
    "        \"role\":\"user\",\n",
    "        \"content\":message\n",
    "    }]\n",
    ")\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a27514f6-d7a5-4292-b98b-dc166416a2fc",
   "metadata": {},
   "source": [
    "### Beautiful Soup Version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "678901b6-5da1-4df7-8b73-a1c69dc758b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = {\n",
    " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
    "} # to make sure we're not blocked as bots from websites\n",
    "\n",
    "class bsWebsite:\n",
    "    \"\"\"\n",
    "    Attributes:\n",
    "        url (str): The URL of the page\n",
    "        title (str): The title of the page\n",
    "        text (str): The readable text from the page\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self,url):\n",
    "        self.url=url\n",
    "        response=requests.get(url,headers=headers) # gets the content of the page in response variable\n",
    "\n",
    "        soup=BeautifulSoup(response.content,'html.parser') # content of response is accessed using html parser for structure\n",
    "        self.title=soup.title.string if soup.title else \"No title\"\n",
    "\n",
    "        for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
    "            irrelevant.decompose()\n",
    "\n",
    "        self.text=soup.body.get_text(separator='\\n',strip=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a1a5ddd-7907-46fd-a1b7-ceeb876262f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "ed = bsWebsite(\"https://edwarddonner.com\")\n",
    "\n",
    "print(ed.url)\n",
    "print(ed.text)\n",
    "print(ed.title)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e965e4-7d20-4980-8cb2-871b8ca63c45",
   "metadata": {},
   "source": [
    "#### Now, let's create a detailed summary for how selenium works using what we just made"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b71a05c6-669b-4632-aeb9-b51daa4429a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "sel=bsWebsite(\"https://www.geeksforgeeks.org/software-engineering/selenium-webdriver-tutorial/\")\n",
    "print(sel.url)\n",
    "print(sel.title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c463c67-2a9c-4fcd-99aa-cab0e2cdf936",
   "metadata": {},
   "outputs": [],
   "source": [
    "def user_prompt_for(web):\n",
    "    user_prompt=f\"\"\"You are looking at a website called {web.title}. \n",
    "    Provide a detailed summary of the given content and the concepts in markdown:\\n[{web.text}]\"\"\"\n",
    "\n",
    "    return user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2118ac4-3355-4f90-b799-ba375ceeafc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_prompt=\"\"\"You are an assistant that analyses the contents of a website based on request of user, \n",
    "while ignoring text that is navigation related. Respond in markdown.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "716b3772-3c73-4010-b089-8bc374cab9de",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(user_prompt_for(ed))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b23b39b4-78a3-4694-8c89-f2ce56b628f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_prompt=user_prompt_for(sel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce29c83c-7b47-43a8-8f92-c2a1aa36f8f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages=[\n",
    "    { \"role\":\"system\", \"content\":system_prompt},\n",
    "    { \"role\":\"user\", \"content\":user_prompt}\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f120702-029e-4c1a-8ffb-2c4944110aa8",
   "metadata": {},
   "outputs": [],
   "source": [
    "response=openai.chat.completions.create(model=MODEL,messages=messages)\n",
    "\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9326415-6d35-4750-b9b1-1ae83a86d6f7",
   "metadata": {},
   "source": [
    "### Selenium Version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba86d4cc-cf4c-4f75-aa57-4126b15463b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# making sure we're in the virtual environment\n",
    "import sys\n",
    "print(sys.executable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ba86dfa-1e91-4535-9c93-3838c46aee52",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01771002-b10f-4681-8710-0f1515866c92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install webdriver-manager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c19b582d-a355-4c20-8028-42a802e7dca5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.edge.service import Service\n",
    "# for edge only:\n",
    "from webdriver_manager.microsoft import EdgeChromiumDriverManager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "978ab0b9-b42b-4136-8383-79b3f84e084b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# works for edge only. Do not close the window that pops up as t will be used to open sites given.\n",
    "driver=webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7dfdeb48-562e-44d3-9044-157d616835fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# creating a similar class as bsWebsie but using selenium\n",
    "class SelWebsite:\n",
    "\n",
    "    def __init__(self,url,driver):\n",
    "        self.driver=driver\n",
    "        self.driver.get(url)\n",
    "        \n",
    "        self.url=self.driver.current_url\n",
    "        self.title=self.driver.title\n",
    "        self.text=self.driver.find_element(By.TAG_NAME,\"body\").text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6174105d-c123-4032-afa8-75588c0f1133",
   "metadata": {},
   "outputs": [],
   "source": [
    "# testing it on OpenAI website\n",
    "gpt=SelWebsite(\"https://openai.com\",driver)\n",
    "print(gpt.url)\n",
    "print(gpt.driver)\n",
    "print(gpt.title)\n",
    "print(gpt.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bde84abf-09dd-4a56-b6a7-4e5a34c1098e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "b7208f3f-6245-48a4-a5ae-d0b59550ee28",
   "metadata": {},
   "source": [
    "##### Troubleshooting in case of errors:\n",
    "1. Make sure the window popped up wasn't closed.\n",
    "2. If the below cell results in any text except an error - driver ID is valid. In this case, quit and restart the driver again.\n",
    "3. If driver ID is invalid, activate driver again using below cells."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30afa4d1-1ce6-4bad-820e-b72cf3eef959",
   "metadata": {},
   "outputs": [],
   "source": [
    "# use the following code to check for valid session ID for driver if error occurs:\n",
    "print(driver.session_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "154ace93-47b2-40ea-9d49-c6c598a67144",
   "metadata": {},
   "outputs": [],
   "source": [
    "# if above is valid but still results in trouble, run both; otherwise run only the second part:\n",
    "# driver.quit()\n",
    "# driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07e74ec5-fda6-462f-b929-7d173b0bdb31",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(user_prompt_for(gpt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5d0fd2e-949a-4358-b963-1395157618d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages2=[\n",
    "    {\"role\":\"system\",\"content\":system_prompt},\n",
    "    {\"role\":\"user\",\"content\":user_prompt_for(gpt)}\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db457f5c-e1be-4087-932d-25ba4880b3ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "response=openai.chat.completions.create(model=MODEL,messages=messages2)\n",
    "\n",
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d448018f-f363-4af9-8ae3-88cc4408da91",
   "metadata": {},
   "source": [
    "### Now let's build a summarize function which can be called directly to summarize any site."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "690ca16b-4b9c-4ddc-b21e-1e69b1d3135a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize(site_url):\n",
    "    \"\"\"\n",
    "    Summarizes the visible content of a website.\n",
    "    - Tries BeautifulSoup parsing first (bsWebsite)\n",
    "    - Falls back to Selenium parsing (SelWebsite) if BS4 fails\n",
    "    - Uses llama3.2 to generate a summary in Markdown\n",
    "    \"\"\"\n",
    "    try:\n",
    "        site=bsWebsite(site_url)\n",
    "    except Exception as e:\n",
    "        print(f\"BS4 failed: {e}\\nTrying Selenium...\\n\")\n",
    "        site=SelWebsite(site_url,driver)\n",
    "\n",
    "    messages3=[\n",
    "        {\"role\":\"system\",\"content\":system_prompt},\n",
    "        {\"role\":\"user\",\"content\":user_prompt_for(site)}\n",
    "    ]\n",
    "\n",
    "    print(f\"\\nSummarizing: {site.title}\\nURL: {site.url}\\n\")\n",
    "\n",
    "    response=openai.chat.completions.create(model=MODEL,messages=messages3)\n",
    "\n",
    "    print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2744296c-ebbd-4696-8517-d14234af9a65",
   "metadata": {},
   "outputs": [],
   "source": [
    "summarize(\"https://www.udemy.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0d2379-c8b3-4900-8671-179303c00929",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
